#!/usr/bin/env python
# coding=utf-8
# __author__ = 'Yunchao Ling'

import sys
import nltk

def remove_stopwords(words, stopwords):
    """Return the items of *words* that do not appear in *stopwords*.

    Order of *words* is preserved; surviving duplicates are kept.
    *stopwords* is copied into a set once so each membership test is O(1)
    instead of O(len(stopwords)), making the filter O(len(words)) overall.
    """
    stopset = set(stopwords)
    return [word for word in words if word not in stopset]

default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

snowball=nltk.stem.SnowballStemmer('english')
stopwords=nltk.corpus.stopwords.words('english')
# print stopwords

infile1=open("D:/data/shanghai_org.txt","r")
for line in infile1:
    line=line.rstrip()
    # tokens=nltk.word_tokenize(line)
    tokens=nltk.tokenize.TweetTokenizer().tokenize(line)
    stems_snowball=[snowball.stem(t) for t in tokens]
    result_tokens=remove_stopwords(stems_snowball,stopwords)

    print line
    print tokens
    print stems_snowball
    print result_tokens
    print "================================================"
infile1.close()
